{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups  # 从sklearn.datasets里导入新闻数据抓取器 fetch_20newsgroups\n",
    "from sklearn.model_selection import  train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer  # 从sklearn.feature_extraction.text里导入文本特征向量化模块\n",
    "from sklearn.naive_bayes import MultinomialNB     # 从sklean.naive_bayes里导入朴素贝叶斯模型\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading 20news dataset. This may take a few minutes.\n",
      "Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18846\n",
      "The Accuracy of Naive Bayes Classifier is: 0.8397707979626485\n",
      "                          precision    recall  f1-score   support\n",
      "\n",
      "             alt.atheism       0.86      0.86      0.86       201\n",
      "           comp.graphics       0.59      0.86      0.70       250\n",
      " comp.os.ms-windows.misc       0.89      0.10      0.17       248\n",
      "comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240\n",
      "   comp.sys.mac.hardware       0.93      0.78      0.85       242\n",
      "          comp.windows.x       0.82      0.84      0.83       263\n",
      "            misc.forsale       0.91      0.70      0.79       257\n",
      "               rec.autos       0.89      0.89      0.89       238\n",
      "         rec.motorcycles       0.98      0.92      0.95       276\n",
      "      rec.sport.baseball       0.98      0.91      0.95       251\n",
      "        rec.sport.hockey       0.93      0.99      0.96       233\n",
      "               sci.crypt       0.86      0.98      0.91       238\n",
      "         sci.electronics       0.85      0.88      0.86       249\n",
      "                 sci.med       0.92      0.94      0.93       245\n",
      "               sci.space       0.89      0.96      0.92       221\n",
      "  soc.religion.christian       0.78      0.96      0.86       232\n",
      "      talk.politics.guns       0.88      0.96      0.92       251\n",
      "   talk.politics.mideast       0.90      0.98      0.94       231\n",
      "      talk.politics.misc       0.79      0.89      0.84       188\n",
      "      talk.religion.misc       0.93      0.44      0.60       158\n",
      "\n",
      "               micro avg       0.84      0.84      0.84      4712\n",
      "               macro avg       0.86      0.84      0.82      4712\n",
      "            weighted avg       0.86      0.84      0.82      4712\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#1.数据获取\n",
    "news = fetch_20newsgroups(subset='all')\n",
    "print(len(news.data))  # 输出数据的条数：18846\n",
    "\n",
    "#2.数据预处理：训练集和测试集分割，文本特征向量化\n",
    "X_train,X_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25,random_state=33) # 随机采样25%的数据样本作为测试集\n",
    "#print X_train[0]  #查看训练样本\n",
    "#print y_train[0:100]  #查看标签\n",
    "\n",
    "#文本特征向量化\n",
    "vec = CountVectorizer()\n",
    "X_train = vec.fit_transform(X_train)\n",
    "X_test = vec.transform(X_test)\n",
    "\n",
    "#3.使用朴素贝叶斯进行训练\n",
    "mnb = MultinomialNB()   # 使用默认配置初始化朴素贝叶斯\n",
    "mnb.fit(X_train,y_train)    # 利用训练数据对模型参数进行估计\n",
    "y_predict = mnb.predict(X_test)     # 对参数进行预测\n",
    "\n",
    "#4.获取结果报告\n",
    "print('The Accuracy of Naive Bayes Classifier is:', mnb.score(X_test,y_test))\n",
    "print(classification_report(y_test, y_predict, target_names = news.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
